resultados_avaliacoes_exp01 = read_avaliacoes()
## Parsed with column specification:
## cols(
## id = col_character(),
## item = col_character(),
## municipio = col_character(),
## criterio = col_character(),
## aproach = col_character(),
## date = col_datetime(format = ""),
## valid = col_logical(),
## contNodeNumberAccess = col_double(),
## found = col_logical(),
## pathSought = col_character(),
## durationMin = col_double(),
## duration = col_double(),
## tipo_exp = col_character()
## )
resultados_avaliacoes_exp01[is.na(resultados_avaliacoes_exp01)] <- ""
gararito = read_gabaritos()
## Parsed with column specification:
## cols(
## municipio = col_character(),
## criterio = col_character(),
## item = col_character(),
## encontrado = col_logical(),
## local_encontrado = col_character(),
## local_encontrado_2 = col_character()
## )
gararito[is.na(gararito)] <- ""
empresas_portais <- readr::read_csv(here::here("data/empresas_portais.csv"))
## Warning: Missing column names filled in: 'X8' [8], 'X9' [9], 'X10' [10],
## 'X11' [11], 'X12' [12], 'X13' [13], 'X14' [14], 'X15' [15], 'X16' [16],
## 'X17' [17], 'X18' [18]
## Parsed with column specification:
## cols(
## municipio = col_character(),
## link_portal_transp = col_character(),
## link_prefeitura = col_character(),
## observacoes = col_character(),
## fornecedor = col_character(),
## tipo_fornecer = col_character(),
## `Fornecedor: Gestões Anteriores` = col_character(),
## X8 = col_character(),
## X9 = col_logical(),
## X10 = col_logical(),
## X11 = col_logical(),
## X12 = col_logical(),
## X13 = col_logical(),
## X14 = col_logical(),
## X15 = col_logical(),
## X16 = col_logical(),
## X17 = col_logical(),
## X18 = col_character()
## )
Para uma avaliação ser considerada válida ela precisa conter 61 itens. Vamos desconsiderar as avaliações que não contém esse número
Vamos remover também o municÃpio de Curral de Cima que encontra-se com seu portal de transparência fora do ar.
resultados_avaliacoes_exp01 <- resultados_avaliacoes_exp01 %>%
filter(tipo_exp == 'all_itens' & (municipio != 'Curral de Cima' & municipio != 'todo'))
empresas_portais <- empresas_portais %>%
select(municipio, fornecedor)
gararito<-left_join(gararito, empresas_portais, by=c("municipio"))
# concatena os dois csv o do gabarito e avaliações do crawler
data<-left_join(resultados_avaliacoes_exp01, gararito, by=c("municipio", "item", "criterio"))
sumarise_exp01 <- data %>%
group_by(municipio, criterio, item, aproach, date) %>%
mutate(
#verifica se a avaliação foi acertiva
tp = (valid == TRUE
& valid == encontrado
#valida se no gabarito e na avaliação o item foi encontrado na mesma url
& (grepl(local_encontrado, pathSought) |
grepl(local_encontrado_2, pathSought))) | (valid == FALSE
& valid == encontrado),
fn = valid == FALSE
& encontrado == TRUE,
fp = valid == TRUE
& encontrado == FALSE
)
sumarise_exp01 %>%
datatable(options = list(pageLength = 5), rownames = FALSE, class = 'cell-border stripe')
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
metricas_result_exp01 <- sumarise_exp01 %>%
#filter(!is.na(aproach )) %>%
group_by(municipio, aproach, date) %>%
summarise(
total_itens = n(),
tp_total = sum(tp),
fn_total = sum(fn),
fp_total = sum(fp),
#cálculo das métricas
recall = tp_total/(tp_total + fn_total),
precision = tp_total/(tp_total + fp_total),
f1_score = (2*(recall*precision))/(recall+precision),
#tempo das avaliações
median_duration_min = median(durationMin),
median_duration = median(duration),
max_duration = max(duration),
max_durationMin = max(durationMin),
median_num_access_node = median(contNodeNumberAccess),
max_num_access_node = max(contNodeNumberAccess),
all_access_node = sum(contNodeNumberAccess),
combination = last(fornecedor),
tipo_exp = last(tipo_exp)
)
metricas_result_exp01 <- metricas_result_exp01 %>%
filter(total_itens == 61)
metricas_result_exp01 %>%
write_csv(here::here("data/resultados_sumarizado_exp01.csv"))
metricas_result_exp01 %>%
arrange(desc(recall))
## # A tibble: 159 x 19
## # Groups: municipio, aproach [86]
## municipio aproach date total_itens tp_total fn_total fp_total
## <chr> <chr> <dttm> <int> <int> <int> <int>
## 1 Campina … bfs 2019-11-09 23:00:06 61 57 0 4
## 2 Campina … dfs 2019-11-13 03:48:50 61 56 0 4
## 3 Alcantil bandit 2019-12-01 21:04:58 61 57 1 3
## 4 Alcantil dfs 2019-11-22 15:33:46 61 57 1 3
## 5 Alcantil dfs 2019-11-30 03:37:54 61 57 1 3
## 6 Cruz do … bandit 2019-11-19 03:30:36 61 57 1 3
## 7 Cruz do … bandit 2019-11-29 04:37:19 61 57 1 3
## 8 Cruz do … bandit 2019-12-01 21:43:44 61 57 1 3
## 9 Cruz do … dfs 2019-11-26 18:06:55 61 57 1 3
## 10 Cruz do … dfs 2019-12-01 21:04:45 61 57 1 3
## # … with 149 more rows, and 12 more variables: recall <dbl>, precision <dbl>,
## # f1_score <dbl>, median_duration_min <dbl>, median_duration <dbl>,
## # max_duration <dbl>, max_durationMin <dbl>, median_num_access_node <dbl>,
## # max_num_access_node <dbl>, all_access_node <dbl>, combination <chr>,
## # tipo_exp <chr>
metricas_result_exp01 %>%
group_by(aproach) %>%
summarise(ocorrencia = n()) %>%
ggplot(aes(y=ocorrencia, x=reorder(aproach, +(ocorrencia)))) +
geom_bar(stat = "identity", fill="#5499C7") +
ggtitle("Número de Avaliações por Abordagem") +
xlab("Abordagem") +
ylab("Número de avaliações") +
coord_flip()
metricas_result_exp01 %>%
group_by(municipio) %>%
summarise(bfs = sum(aproach == 'bfs'), dfs = sum(aproach == 'dfs'), bandit = sum(aproach == 'bandit')) %>%
arrange(desc(dfs)) %>%
datatable(options = list(pageLength = 10), rownames = FALSE, class = 'cell-border stripe')
metricas_result_exp01 %>%
select(municipio, aproach, date, recall, precision, f1_score) %>%
arrange(desc(recall)) %>%
datatable(options = list(pageLength = 10), rownames = FALSE, class = 'cell-border stripe')
metricas_result_exp01 %>%
ggplot(aes(x = aproach, y = f1_score)) +
geom_boxplot() +
geom_jitter(aes(color=aproach), alpha=0.4) +
scale_color_manual(values=c("#999999", "#f39422", "#537ec5", '#293a80')) +
labs(x='Abordagem', y="F1 Score", title="Avaliações por Abordagem")
metricas_result_exp01 %>%
group_by(aproach) %>%
ggplot(aes(x = aproach, y = f1_score)) +
geom_dotplot(aes(fill = aproach),
color='white',
binaxis = "y",
binwidth = 0.009,
stackdir = "center") +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
geom = "crossbar", width = 0.5, alpha=0.3,aes(colour='Mediana'), ) +
scale_linetype_manual("", values=c("median"="x")) +
scale_fill_manual(values=c("#999999", "#f39422", "#537ec5", '#293a80')) +
scale_colour_manual(values=c("black", "black", "#56B4E9", '#293a80')) +
labs(x='Abordagem', y="F1 Score", title="Avaliações por Abordagem", color = "")
#Calcula a media das posições escolhidas nas buscas.
set.seed(123)
f1_score_boot <- function (d, i) {
dt<-d[i,]
return(c(
median(dt$f1_score)
))
}
create_ic <- function(x) {
x <- last(x)
df.boot <- filter(metricas_result_exp01, aproach == x)
bootstrap.aproach <- boot(
data = df.boot,
statistic = f1_score_boot,
R = 4000 )
ci = tidy(bootstrap.aproach,
conf.level = .95,
conf.method = "bca",
conf.int = TRUE)
print(glimpse(ci))
return(ci)
}
ics.aproach_exp01 <- metricas_result_exp01 %>%
group_by(aproach) %>%
summarise(
median_value = median(f1_score),
ci = list(create_ic(aproach))
) %>%
unnest(ci)
## Observations: 1
## Variables: 5
## $ statistic <dbl> 0.9220551
## $ bias <dbl> -0.0005552468
## $ std.error <dbl> 0.01722969
## $ conf.low <dbl> 0.8909991
## $ conf.high <dbl> 0.9484618
## # A tibble: 1 x 5
## statistic bias std.error conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.922 -0.000555 0.0172 0.891 0.948
## Observations: 1
## Variables: 5
## $ statistic <dbl> 0.9265993
## $ bias <dbl> 0.0004114212
## $ std.error <dbl> 0.01209642
## $ conf.low <dbl> 0.9009009
## $ conf.high <dbl> 0.9484618
## # A tibble: 1 x 5
## statistic bias std.error conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.927 0.000411 0.0121 0.901 0.948
## Observations: 1
## Variables: 5
## $ statistic <dbl> 0.9107143
## $ bias <dbl> 0.001594028
## $ std.error <dbl> 0.01594229
## $ conf.low <dbl> 0.8909091
## $ conf.high <dbl> 0.9454545
## # A tibble: 1 x 5
## statistic bias std.error conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.911 0.00159 0.0159 0.891 0.945
ics.aproach_exp01 %>%
ggplot() +
geom_errorbar(aes(x = aproach, y = statistic, ymin = conf.low, ymax = conf.high), width = 0.05) +
geom_point(aes(x=aproach, y=median_value), color='red', size=3)
## Warning: Ignoring unknown aesthetics: y
metricas_result_exp01 %>%
group_by(municipio) %>%
summarise(max_value = max(f1_score), min_value = min(f1_score), median_value=median(f1_score)) %>%
ggplot(aes(y=municipio)) +
geom_point(aes(x=min_value, color='#293a80'), size=3) +
geom_point(aes(x= max_value, color='#537ec5'), size=3) +
geom_dumbbell(color="#e3e2e1", aes(x = min_value, xend = max_value), colour_x = "#293a80", colour_xend = "#537ec5", size=3,
dot_guide=TRUE, dot_guide_size=0.25) +
#geom_point(aes(x= median_value, color='#f39422'), size=6, shape=108) +
geom_point(aes(x= median_value, color='#f39422'), size=3, alpha= 0.5) +
scale_color_manual(name = "", values = c("#293a80", "#537ec5", "#f39422"), labels = c("MÃnimo", "Máximo", "Mediana")) +
labs(x='F1 Score', y=NULL, title="F1 Score Por MunicÃpio")
metricas_result_exp01 %>%
ggplot(aes(x = reorder(aproach, +(max_durationMin)), y = max_durationMin)) +
geom_boxplot()
metricas_result_exp01 %>%
ggplot(aes(x = reorder(aproach, +(max_num_access_node)), y = max_num_access_node)) +
geom_boxplot()
metricas_result_exp01 %>%
group_by(combination) %>%
summarise(max_value = max(f1_score), min_value = min(f1_score), median_value=median(f1_score)) %>%
ggplot(aes(y=combination)) +
geom_point(aes(x=min_value, color='#293a80'), size=3) +
geom_point(aes(x= max_value, color='#537ec5'), size=3) +
geom_dumbbell(color="#e3e2e1", aes(x = min_value, xend = max_value), colour_x = "#293a80", colour_xend = "#537ec5", size=3,
dot_guide=TRUE, dot_guide_size=0.25) +
geom_point(aes(x= median_value, color='#f39422'), size=3, alpha=0.7) +
scale_color_manual(name = "", values = c("#293a80", "#537ec5", "#f39422"), labels = c("MÃnimo", "Máximo", "Mediana")) +
labs(x='F1 Score', y=NULL, title="Combinações por F1 Score")
metricas_result_exp01 %>%
group_by(combination) %>%
summarise(max_value = max(max_num_access_node), min_value = min(max_num_access_node), median_value=median(max_num_access_node)) %>%
ggplot(aes(y=combination)) +
geom_point(aes(x=min_value, color='#293a80'), size=3) +
geom_point(aes(x= max_value, color='#537ec5'), size=3) +
geom_dumbbell(color="#e3e2e1", aes(x = min_value, xend = max_value), colour_x = "#293a80", colour_xend = "#537ec5", size=3,
dot_guide=TRUE, dot_guide_size=0.25) +
geom_point(aes(x= median_value, color='#f39422'), size=3, alpha=0.7) +
scale_color_manual(name = "", values = c("#293a80", "#537ec5", "#f39422"), labels = c("MÃnimo", "Máximo", "Mediana")) +
labs(x='Número de nós', y=NULL, title="Combinações Por Número de Nós Acessados")
metricas_result_exp01 %>%
group_by(combination) %>%
summarise(max_value = max(max_durationMin), min_value = min(max_durationMin), median_value=median(max_durationMin)) %>%
ggplot(aes(y=combination)) +
geom_point(aes(x=min_value, color='#293a80'), size=3) +
geom_point(aes(x= max_value, color='#537ec5'), size=3) +
geom_dumbbell(color="#e3e2e1", aes(x = min_value, xend = max_value), colour_x = "#293a80", colour_xend = "#537ec5", size=3,
dot_guide=TRUE, dot_guide_size=0.25) +
geom_point(aes(x= median_value, color='#f39422'), size=3, alpha=0.7) +
scale_color_manual(name = "", values = c("#293a80", "#537ec5", "#f39422"), labels = c("MÃnimo", "Máximo", "Mediana")) +
labs(x='Minutos', y=NULL, title="Combinações Por Tempo de Duração")